Multivariate Analysis

Assignment 1

Author

Shubh Gaur - 23200555

Importing Libraries

# Packages required for the analysis
packages <- c("tidyverse", "reshape2", "MASS", "pls")

# Install any package that is missing, then attach it.
# requireNamespace() is the recommended way to test availability; unlike
# require(), a failed check does not attempt a partial attach, and
# library() then fails loudly if installation did not succeed.
for (pkg in packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

Loading the data

# Read the temperature data; show_col_types = FALSE suppresses the column
# specification message read_csv prints by default.
data <- read_csv("Temperature_data.csv", show_col_types = FALSE)

# Preview the first six rows. The former strict.width = 'cut' argument is
# not a head() parameter (it was silently swallowed by `...`), so it has
# been dropped; the output is unchanged.
head(data)
# A tibble: 6 × 15
  Max1R13_1 T_RC1 T_LC1  RCC1 canthiMax1 T_FHCC1 T_FHLC1 T_FHTC1 T_OR1
      <dbl> <dbl> <dbl> <dbl>      <dbl>   <dbl>   <dbl>   <dbl> <dbl>
1      36.3  36.3  36.6  36.0       36.6    36.3    35.7    35.8  36.0
2      36.6  36.4  36.7  36.3       36.7    35.5    35.9    35.9  36.8
3      35.1  35.3  35.2  34.9       35.3    34.7    34.7    34.7  35.3
4      35.5  35.6  35.9  34.9       36.0    34.3    34.5    34.0  35.8
5      36.0  35.9  35.6  35.7       36.0    35.1    34.6    34.5  35.5
6      37.5  37.5  37.2  36.3       37.4    36.5    35.1    35.6  37.2
# ℹ 6 more variables: aveAllR13_1 <dbl>, aveOralM <dbl>, Gender <chr>,
#   Age <chr>, Ethnicity <chr>, pyrexic <dbl>
# Five-number summaries plus NA counts for every column. strict.width is
# not a summary() argument (it was silently ignored via `...`), so it has
# been removed; the output is unchanged.
summary(data)
   Max1R13_1         T_RC1           T_LC1            RCC1      
 Min.   :33.35   Min.   :33.34   Min.   :33.78   Min.   :32.78  
 1st Qu.:35.23   1st Qu.:35.30   1st Qu.:35.29   1st Qu.:34.84  
 Median :35.58   Median :35.65   Median :35.64   Median :35.27  
 Mean   :35.73   Mean   :35.79   Mean   :35.76   Mean   :35.38  
 3rd Qu.:36.08   3rd Qu.:36.13   3rd Qu.:36.13   3rd Qu.:35.79  
 Max.   :38.52   Max.   :38.61   Max.   :38.16   Max.   :38.31  
                 NA's   :1                       NA's   :1      
   canthiMax1       T_FHCC1         T_FHLC1         T_FHTC1     
 Min.   :33.85   Min.   :30.02   Min.   :30.21   Min.   :27.70  
 1st Qu.:35.42   1st Qu.:34.27   1st Qu.:34.18   1st Qu.:34.21  
 Median :35.77   Median :34.85   Median :34.74   Median :34.74  
 Mean   :35.91   Mean   :34.93   Mean   :34.71   Mean   :34.69  
 3rd Qu.:36.25   3rd Qu.:35.60   3rd Qu.:35.25   3rd Qu.:35.27  
 Max.   :38.52   Max.   :38.18   Max.   :37.59   Max.   :37.94  
 NA's   :2       NA's   :4       NA's   :2       NA's   :2      
     T_OR1        aveAllR13_1       aveOralM        Gender         
 Min.   :32.18   Min.   :29.95   Min.   :33.90   Length:1237       
 1st Qu.:35.00   1st Qu.:34.41   1st Qu.:36.79   Class :character  
 Median :35.46   Median :34.96   Median :37.04   Mode  :character  
 Mean   :35.48   Mean   :34.99   Mean   :37.23                     
 3rd Qu.:35.94   3rd Qu.:35.49   3rd Qu.:37.49                     
 Max.   :37.75   Max.   :37.82   Max.   :40.34                     
 NA's   :2       NA's   :2                                         
     Age             Ethnicity            pyrexic      
 Length:1237        Length:1237        Min.   :0.0000  
 Class :character   Class :character   1st Qu.:0.0000  
 Mode  :character   Mode  :character   Median :0.0000  
                                       Mean   :0.1851  
                                       3rd Qu.:0.0000  
                                       Max.   :1.0000  
                                                       

Now, we will be sampling 1000 data points using random indexing from the original dataframe.

# Seed the RNG with the student number so the random sample is reproducible
set.seed(23200555)
sample_size <- 1000
# Draw 1000 distinct row positions, without replacement, from the full data
sample_indices <- sample(nrow(data),size=sample_size)
# Sanity checks: peek at the first five indices and confirm the length
cat(paste(sample_indices[1:5]))
946 130 1181 669 314
cat(paste('\nLength of sample:',length(sample_indices)))

Length of sample: 1000

We have successfully sampled 1000 random indexes which we’ll be using for the data in consideration. Lets create a dataframe using the sampled indexes.

# Subset the original dataframe down to the sampled rows only
sample_data <- data[sample_indices, ]
head(sample_data)
# A tibble: 6 × 15
  Max1R13_1 T_RC1 T_LC1  RCC1 canthiMax1 T_FHCC1 T_FHLC1 T_FHTC1 T_OR1
      <dbl> <dbl> <dbl> <dbl>      <dbl>   <dbl>   <dbl>   <dbl> <dbl>
1      35.5  35.5  35.4  35.3       35.5    34.0    34.3    34.2  35.0
2      37.9  38.0  37.9  37.3       37.9    36.0    36.1    36.1  36.8
3      35.8  36.0  35.9  35.7       36.1    34.8    35.0    34.8  35.7
4      36.3  36.5  36.4  36.1       36.4    34.7    35.0    34.8  35.8
5      38.0  38.0  37.9  37.3       37.9    36.0    36.1    36.1  36.8
6      35.2  35.2  35.4  34.9       35.4    34.8    35.0    35.1  34.6
# ℹ 6 more variables: aveAllR13_1 <dbl>, aveOralM <dbl>, Gender <chr>,
#   Age <chr>, Ethnicity <chr>, pyrexic <dbl>

The variable aveOralM contains the average of several oral temperature measurements and is assumed to be the most accurate measure of temperature (°C).

Lets check if this variable contains any NA values.

# anyNA() is the idiomatic (and faster) equivalent of any(is.na(...));
# it returns the same logical value.
anyNA(sample_data$aveOralM)
[1] FALSE

As clear from the output this variable doesn’t have any NA values.

# Names of the eleven temperature measurement columns — every column
# except the four demographic/outcome variables at the end of the data
temperature_variables <- c(
  "Max1R13_1", "T_RC1", "T_LC1", "RCC1", "canthiMax1", "T_FHCC1",
  "T_FHLC1", "T_FHTC1", "T_OR1", "aveAllR13_1", "aveOralM"
)

# Scatterplot matrix of all pairwise relationships between the variables
pairs(sample_data[, temperature_variables])

It is clear from the pairplot that there is high correlation among the variables in the dataset.

We will now be plotting boxplots for each observation to check if there are outlying values.

# Reshape to long format for plotting. pivot_longer() replaces the retired
# reshape2::melt(); the explicit factor() keeps the original column order
# so the boxplots appear in the same sequence as before.
melted_data <- sample_data[, temperature_variables] %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  mutate(variable = factor(variable, levels = temperature_variables))

# One boxplot per temperature variable, with rotated x labels for legibility
ggplot(melted_data, aes(x = variable, y = value)) +
  geom_boxplot() +
  labs(x = "Variables", y = "Temperature(°C)") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Some outlying values are present in all the variables, and very low temperature readings indicate measurement errors. We will therefore remove all data points where the oral temperature is more than 4 standard deviations below the mean.

# Mean and standard deviation of the reference oral temperature
# (missing values ignored)
mean_oral_temp <- mean(sample_data$aveOralM, na.rm = TRUE)
sd_oral_temp <- sd(sample_data$aveOralM, na.rm = TRUE)

# Readings more than 4 SDs below the mean are treated as measurement errors
lower_threshold <- mean_oral_temp - 4 * sd_oral_temp

# Keep only the subjects at or above the threshold
filtered_data <- filter(sample_data, aveOralM >= lower_threshold)

# Reshape the filtered data to long format. pivot_longer() replaces the
# retired reshape2::melt(); the factor() call preserves the original
# column order so the boxplots keep the same sequence.
melted_data <- filtered_data[, temperature_variables] %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  mutate(variable = factor(variable, levels = temperature_variables))

# Boxplots after outlier removal, for comparison with the previous plot
ggplot(melted_data, aes(x = variable, y = value)) +
  geom_boxplot() +
  labs(x = "Variables", y = "Temperature(°C)") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

We have successfully removed measurement errors according to the given information.

Clustering

Dissimilarity matrix

We will be constructing the dissimilarity matrix using Euclidean distance. The reason for choosing euclidean distance as the dissimilarity measure is we want to see how different any two subjects are in terms of temperature values and the Euclidean distance between two data points would represent the overall difference between their temperature measurements across all temperature variables.

# Pairwise Euclidean distances between subjects, computed on every
# temperature variable except the response aveOralM
dist.max <- filtered_data[, temperature_variables] %>%
  dplyr::select(-aveOralM) %>%
  dist(method = "euclidean")

Hierarchical Clustering

Now, we will be plotting the cluster dendrogram using complete linkage because of its property of considering maximum dissimilarity between clusters. This linkage strategy can efficiently group patients with similar temperature patterns together, enabling the identification of distinct clusters representing different temperature profiles.

# Agglomerative hierarchical clustering using complete linkage
cl.complete <- hclust(dist.max, method = "complete")

# Dendrogram of the clustering solution
plot(cl.complete, xlab="Complete linkage", sub="")

Now, we will be cutting the dendrogram into two clusters. We are doing this because we want to see if subjects with elevated temperatures are clustered into a single unit and subjects whose temperature is not elevated are clustered together.

# Assign each subject to one of two clusters by cutting the dendrogram
filtered_data$cluster_hierarchical <- cutree(cl.complete, k = 2)

# Plotting frame: temperature columns plus the cluster label as a factor
plot_data <- filtered_data[, temperature_variables]
plot_data$cluster_hierarchical <- factor(filtered_data$cluster_hierarchical)

# Scatter plot of each temperature variable against aveOralM, coloured by
# cluster; aveOralM itself is excluded from the loop up front
for (temp_var in setdiff(temperature_variables, "aveOralM")) {
  plot <- ggplot(plot_data, aes(x = aveOralM, y = .data[[temp_var]],
                                color = cluster_hierarchical)) +
    geom_point() +
    # Dashed reference line at the 37.8 degrees Celsius pyrexia threshold
    geom_vline(xintercept = 37.8, linetype = "dashed", color = "black") +
    labs(title = paste("Scatter Plot of", temp_var, "vs aveOralM"),
         x = "aveOralM", y = temp_var, color = "Cluster") +
    theme_minimal()
  print(plot)
}

In summary, the selection of euclidean distance metric with complete linkage is a well-informed choice, driven by rigorous testing and tailored to the specific requirements of our problem domain. This combination offers robust cluster formations, preserves cluster heterogeneity, and optimizes the identification of temperature patterns associated with pyrexia, ultimately enhancing the effectiveness of our analysis.

K-means clustering

Let us implement another clustering approach to screen patients: k-means clustering. The k-means algorithm requires a dataframe without any NA values, so before building our model we will omit every record that contains even a single NA value.

# k-means requires complete cases, so drop every row containing an NA
filtered_data_na_removed <- na.omit(filtered_data)

# Predictor matrix used for clustering: all temperature variables except
# the response aveOralM (hoisted out of the loop so it is built once)
km_predictors <- dplyr::select(
  filtered_data_na_removed[, temperature_variables], -aveOralM
)

n <- nrow(filtered_data_na_removed)
WCSS <- rep(0, 10)

# k = 1: the within-cluster sum of squares equals the total sum of squares
WCSS[1] <- (n - 1) * sum(apply(km_predictors, 2, var))

# k = 2..10. nstart = 25 random restarts guard against poor local optima,
# giving a stable elbow curve; tot.withinss is the sum of per-cluster
# withinss, so this matches sum(...$withinss).
for (k in 2:10) {
  WCSS[k] <- kmeans(km_predictors, centers = k, nstart = 25)$tot.withinss
}
plot(1:10, WCSS, type = "b", xlab = "k", ylab = "Within group sum of squares")

According to the plot of within the cluster sum of squares for different number of clusters, we see a near stagnant error after \(k = 3\). Therefore,we select \(k = 3\) as the optimal number of clusters.

# k-means with three clusters on every temperature variable except the
# response aveOralM
kmeans_result <- kmeans(
  dplyr::select(filtered_data_na_removed[, temperature_variables], -aveOralM),
  centers = 3
)

# Attach the cluster assignment to the data
filtered_data_na_removed$cluster_kmeans <- kmeans_result$cluster

# Plotting frame: temperature columns plus the cluster label
plot_data <-
  filtered_data_na_removed[, c(temperature_variables, "cluster_kmeans")]

# Scatter plot of each temperature variable against aveOralM, coloured by
# k-means cluster; aveOralM itself is excluded from the loop up front
for (temp_var in setdiff(temperature_variables, "aveOralM")) {
  plot <- ggplot(plot_data, aes(x = aveOralM, y = .data[[temp_var]],
                                color = factor(cluster_kmeans))) +
    geom_point() +
    # Dashed reference line at the 37.8 degrees celsius pyrexia threshold
    geom_vline(xintercept = 37.8, linetype = "dashed", color = "black") +
    labs(title = paste("Scatter Plot of", temp_var, "vs aveOralM"),
         x = "aveOralM", y = temp_var, color = "Cluster") +
    theme_minimal()
  print(plot)
}

Let us now perform k means clustering with 2 clusters as according to our problem we need to screen subjects for elevated temperatures. So either their temperature can be elevated or it can be normal. So number of groups = 2.

# k-means with two clusters (elevated vs normal temperature screening)
kmeans_result <- kmeans(
  dplyr::select(filtered_data_na_removed[, temperature_variables], -aveOralM),
  centers = 2
)

# Overwrite the cluster column with the two-cluster assignment
filtered_data_na_removed$cluster_kmeans <- kmeans_result$cluster

# Plotting frame: temperature columns plus the cluster label
plot_data <-
  filtered_data_na_removed[, c(temperature_variables, "cluster_kmeans")]

# Scatter plot of each temperature variable against aveOralM, coloured by
# k-means cluster; aveOralM itself is excluded from the loop up front
for (temp_var in setdiff(temperature_variables, "aveOralM")) {
  plot <- ggplot(plot_data, aes(x = aveOralM, y = .data[[temp_var]],
                                color = factor(cluster_kmeans))) +
    geom_point() +
    # Dashed reference line at the 37.8 degrees Celsius pyrexia threshold
    geom_vline(xintercept = 37.8, linetype = "dashed", color = "black") +
    labs(title = paste("Scatter Plot of", temp_var, "vs aveOralM"),
         x = "aveOralM", y = temp_var, color = "Cluster") +
    theme_minimal()
  print(plot)
}

After reviewing the clustering patterns found using the two approaches used we find k-means with \(k = 2\) as a better solution for finding subjects with elevated temperatures because it has lesser overlap as compared to the hierarchical clustering solution.

Discriminant Analysis

Linear Discriminant Analysis

We will try to classify subjects based on gender utilising the facial temperature data with the help of LDA. Lets create a new dataframe using the existing filtered dataframe with facial temperature variables and gender. Note: We need to scale the dataframe also as its a necessity for LDA that each predictor has same variance.

# Keep only the facial temperature predictors: drop the contiguous run of
# columns from aveOralM through the cluster labels, then standardise each
# remaining column (mean 0, SD 1)
facial_temp_data <- filtered_data_na_removed %>%
  dplyr::select(-(aveOralM:cluster_kmeans)) %>%
  scale() %>%
  as.data.frame()

# Re-attach gender as a factor — the class label for the discriminant models
facial_temp_data$Gender <- factor(filtered_data_na_removed$Gender)
colnames(facial_temp_data)
 [1] "Max1R13_1"   "T_RC1"       "T_LC1"       "RCC1"        "canthiMax1" 
 [6] "T_FHCC1"     "T_FHLC1"     "T_FHTC1"     "T_OR1"       "aveAllR13_1"
[11] "Gender"     

The dataframe is successfully created with the required columns.

We have all the required data, now we can fit the model.

# Fit linear discriminant analysis: Gender modelled on all ten
# standardised facial temperature variables. Printing the fit shows the
# group priors, group means, and LD1 coefficients.
lda.res <- lda(Gender ~ ., data=facial_temp_data)
lda.res
Call:
lda(Gender ~ ., data = facial_temp_data)

Prior probabilities of groups:
  Female     Male 
0.560804 0.439196 

Group means:
        Max1R13_1      T_RC1      T_LC1       RCC1 canthiMax1    T_FHCC1
Female -0.2115087 -0.2061634 -0.1952521 -0.2101984 -0.2037018 -0.4540262
Male    0.2700729  0.2632475  0.2493151  0.2683998  0.2601044  0.5797405
          T_FHLC1    T_FHTC1      T_OR1 aveAllR13_1
Female -0.2314019 -0.1223553 -0.1642501  -0.2013869
Male    0.2954743  0.1562340  0.2097290   0.2571485

Coefficients of linear discriminants:
                     LD1
Max1R13_1    0.603623406
T_RC1       -1.010836015
T_LC1       -0.192641096
RCC1        -0.065528105
canthiMax1   0.566198190
T_FHCC1      2.939246475
T_FHLC1     -1.298080042
T_FHTC1     -0.814239404
T_OR1        0.008064875
aveAllR13_1 -0.110506440

Lets check the posterior probability of belonging to each class for first six observations.

# In-sample predictions; $posterior holds P(class | x) for each subject
predictions <- predict(lda.res)
head(predictions$posterior)
     Female       Male
1 0.9767409 0.02325911
2 0.7975273 0.20247272
3 0.9625590 0.03744103
4 0.9699389 0.03006109
5 0.7855903 0.21440967
6 0.8955196 0.10448045
# Training-set accuracy: proportion of subjects whose predicted class
# matches the recorded gender
mean(predictions$class==facial_temp_data$Gender)
[1] 0.8874372

Our model has an accuracy of 88.7 %.

Let us perform LDA with cross-validation, as this gives a more reliable estimate of the misclassification rate than in-sample accuracy.

# Refit the LDA with leave-one-out cross-validation (CV = TRUE) and
# cross-tabulate the CV-predicted classes against the true genders
lda.res.cv <- lda(Gender ~ ., CV=TRUE, data=facial_temp_data)
conf_mat <- table(lda.res.cv$class, facial_temp_data$Gender)
conf_mat
        
         Female Male
  Female    535   91
  Male       23  346

Lets check the misclassification rate.

# Misclassification rate = 1 - accuracy, taken from the confusion matrix
# (diagonal = correctly classified subjects)
mc_rate <- 1-sum(diag(conf_mat))/sum(conf_mat)
mc_rate
[1] 0.1145729

Our LDA classifier misclassified 11.46 % of the genders in the dataset.

Decision Boundary

# Discriminant coefficients — a single column (LD1) since there are two groups
coefficients <- coef(lda.res)

# Project every subject onto LD1.
# NOTE(review): unlike predict(lda.res)$x, this projection is not
# re-centred, so the LD1 axis may be shifted by a constant relative to
# predict()'s scores; the shift is identical for both groups, so the
# shapes and separation of the densities are unaffected.
ld1_values <- as.matrix(dplyr::select(facial_temp_data, -Gender)) %*% coefficients

# Combine LD1 values with group information
ld1_data <- data.frame(LD1 = ld1_values, Gender = facial_temp_data$Gender)

# Mean LD1 score within each gender group.
# NOTE(review): the dashed lines below mark these group means, not the
# actual LDA decision boundary (which lies between the means, adjusted
# for the unequal priors) — confirm whether the boundary itself is wanted.
mean_ld1_female <- mean(subset(ld1_data, Gender == "Female")$LD1)
mean_ld1_male <- mean(subset(ld1_data, Gender == "Male")$LD1)

# Density of LD1 by gender, with the group means as dashed vertical lines
ggplot(ld1_data, aes(x = LD1, color = Gender)) +
  geom_density() +  
  geom_vline(xintercept = mean_ld1_female, color = "blue", linetype = "dashed") +
  geom_vline(xintercept = mean_ld1_male, color = "red", linetype = "dashed") +
  labs(x = "LD1", y = "Density", title = "Linear decision boundary plots for group gender")  +
  theme_minimal() +
  scale_color_manual(values = c("blue", "red")) +
  theme(legend.position = "top")

There is some overlap between the two gender groups as can be seen in the above plot. However, their distributions are quite different from each other which indicates there are two cluster formations(or two groups).

Quadratic Discriminant Analysis

We will now fit a QDA model on the same data.

# Fit quadratic discriminant analysis on the same standardised predictors;
# unlike LDA, QDA estimates a separate covariance matrix for each group
qda.res <- qda(Gender ~ ., data=facial_temp_data)
qda.res
Call:
qda(Gender ~ ., data = facial_temp_data)

Prior probabilities of groups:
  Female     Male 
0.560804 0.439196 

Group means:
        Max1R13_1      T_RC1      T_LC1       RCC1 canthiMax1    T_FHCC1
Female -0.2115087 -0.2061634 -0.1952521 -0.2101984 -0.2037018 -0.4540262
Male    0.2700729  0.2632475  0.2493151  0.2683998  0.2601044  0.5797405
          T_FHLC1    T_FHTC1      T_OR1 aveAllR13_1
Female -0.2314019 -0.1223553 -0.1642501  -0.2013869
Male    0.2954743  0.1562340  0.2097290   0.2571485

Lets check the posterior probability of belonging to each class for first six observations.

# In-sample QDA predictions; $posterior holds P(class | x) for each subject
predictions <- predict(qda.res)
head(predictions$posterior)
     Female        Male
1 0.9963211 0.003678892
2 0.7857122 0.214287778
3 0.9815919 0.018408071
4 0.9833097 0.016690306
5 0.8045936 0.195406353
6 0.8753949 0.124605132
# Training-set accuracy of the QDA model: proportion of subjects whose
# predicted class matches the recorded gender
mean(predictions$class==facial_temp_data$Gender)
[1] 0.880402

Our model has an accuracy of 88%.

Let us perform QDA with cross-validation, as this gives a more reliable estimate of the misclassification rate than in-sample accuracy.

# Refit the QDA with leave-one-out cross-validation (CV = TRUE) and
# cross-tabulate the CV-predicted classes against the true genders
qda.res.cv <- qda(Gender ~ ., CV=TRUE, data=facial_temp_data)
conf_mat <- table(qda.res.cv$class, facial_temp_data$Gender)
conf_mat
        
         Female Male
  Female    527   97
  Male       31  340

Lets check the misclassification rate.

# QDA misclassification rate = 1 - accuracy from the confusion matrix
mc_rate <- 1-sum(diag(conf_mat))/sum(conf_mat)
mc_rate
[1] 0.1286432

Our QDA classifier misclassified 12.86 % of the genders in the dataset.

It is clear from the performance metrics of both models that the LDA model outperforms the QDA model in terms of accuracy and misclassification rate.

Comment

Principal Component Analysis

We will now reduce the dimensionality of the data using a well known technique known as PCA.

# PCA on the facial temperature variables (Gender excluded). The columns
# were already centred and scaled upstream, so prcomp's defaults suffice.
pca <- prcomp(facial_temp_data |> dplyr::select(-Gender))
summary(pca)
Importance of components:
                          PC1     PC2     PC3     PC4     PC5     PC6    PC7
Standard deviation     2.8115 0.98341 0.61294 0.51351 0.46132 0.33568 0.2983
Proportion of Variance 0.7904 0.09671 0.03757 0.02637 0.02128 0.01127 0.0089
Cumulative Proportion  0.7904 0.88715 0.92472 0.95109 0.97237 0.98364 0.9925
                           PC8     PC9    PC10
Standard deviation     0.22754 0.11873 0.09371
Proportion of Variance 0.00518 0.00141 0.00088
Cumulative Proportion  0.99771 0.99912 1.00000
# Cumulative proportion of variance explained by the first q components
prop <- cumsum(pca$sdev^2) / sum(pca$sdev^2)

# seq_along() is the safe indexing idiom (1:length(x) misbehaves on
# zero-length input); output is identical here
plot(seq_along(prop), prop, type = "b",
     xlab = "Number of Principal Components",
     ylab = "Cumulative Proportion of Variance Explained",
     main = "Cumulative Proportion of Variance Explained by Principal Components")

From the PCA summary and the above plot it is clear that the first principal component explains much of the variance in the data. Moreover, the first 2 principal components together explain around 88% of the variation in the data, which is good enough.

Lets now derive the principal component score for each subject in the data using first principles.

# Covariance matrix of the standardised facial temperature variables
cov_matrix <- cov(facial_temp_data |> dplyr::select(-Gender))

# Eigendecomposition: the eigenvectors are the principal component loadings
eigen <- eigen(cov_matrix)

# eigen() already returns eigenvalues in decreasing order, but sort
# explicitly so the code does not rely on that behaviour
sorted_indices <- order(eigen$values, decreasing = TRUE)

sorted_eigenvectors <- eigen$vectors[, sorted_indices]

# Scores = data matrix times the loadings. The data were standardised
# (hence centred) upstream, so no further centring is needed.
# NOTE(review): eigenvector signs are arbitrary, so a component can come
# out as the mirror image of prcomp's — the overlay matches up to sign.
pc_scores <-
  as.matrix(facial_temp_data |> dplyr::select(-Gender)) %*% sorted_eigenvectors

# Combine principal component scores into a single data frame
combined_scores <- data.frame(
  PC1_first_principles = pc_scores[, 1],
  PC2_first_principles = pc_scores[, 2],
  PC1_prcomp = pca$x[, 1],
  PC2_prcomp = pca$x[, 2]
)

# Superimpose the two score sets. Bug fix: the legend override promises
# shapes 16 (circle) and 17 (triangle), but both layers previously drew
# shape 16 — the prcomp layer now uses shape 17 so plot and legend agree.
ggplot(combined_scores,
       aes(x = PC1_first_principles, y = PC2_first_principles)) +
  geom_point(aes(color = "First Principles"), shape = 16, alpha = 0.2) +
  geom_point(aes(x = PC1_prcomp, y = PC2_prcomp, color = "prcomp"),
             shape = 17, alpha = 0.3) +
  labs(x = "Principal Component 1", y = "Principal Component 2",
       title = "Superimposed Principal Component Scores") +
  scale_color_manual(values = c("First Principles" = "blue", "prcomp" = "red"),
                     labels = c("First Principles", "prcomp")) +
  # Legend marker shapes: circle for first principles, triangle for prcomp
  guides(color = guide_legend(override.aes = list(shape = c(16, 17)))) +
  theme_minimal()

It is clear from the above superimposed scatter plot of derived principal components using the first principles with principal components found using prcomp function that our computation for principal components is accurate.

Furthermore, we can see that the selected principal components do not show any sort of correlating pattern like we had seen earlier in the pairplot. Therefore, Reducing the dimensionality space enabled us to mitigate the high collinearity which was present in the data.

Principal Component Regression

Introduction

Principal Components Regression (PCR) as the name suggests is a technique that uses the same strategy as linear regression but with principal components. In this technique the principal components generated are used to model the response variable.

Purpose

This technique is widely used with data having high multicollinearity between predictor variables and its primary motive is to improve the stability and accuracy of regression model by reducing the dimensionality of the predictor space. It is considered a regularising regression strategy for highly multicollinear data.

Methodology

There are a sequence of steps which are essential when building PCR regression models which are listed below:

  1. Generate principal components from the predictor variables to represent the data.

  2. Select an appropriate value k i.e. the number of principal components to select. k can be determined using methods such as cross validation, information criteria(AIC,BIC) or based on cumulative proportion of variance explained by the k selected principal components.

  3. Once the appropriate number of principal components is selected, a regression model is fitted using the k selected principal components against the response variable. Generally, multiple regression models are fitted with varying numbers of principal components, along with cross validation, to determine the best k.

Choices in PCR

We need to make several choices to implement PCR which are listed below:

  1. Scaling of predictors : when predictor variables have different scales we need to standardize them to ensure that the principal components are not biased i.e different scales aren’t influencing the principal components.

  2. Number of principal components (k) : The choice of k decides the dimensionality of predictor space after PCA.As discussed above it can be determined using techniques like cross validation, information criteria(AIC/BIC), or a predefined number based on domain knowledge

  3. Evaluation of model : Model’s performance can be assessed after determining the appropriate number of principal components to select using techniques like cross validation/information criteria which is already covered in the previous point.

Advantages

  1. Multicollinearity alleviation : Predictor variables are transformed to principal components to address multicollinearity issues which improves the stability of regression models.

  2. Dimensionality Reduction : Since PCA is used, the dimensionality of the predictor space is reduced, which leads to simpler models and enhanced computational efficiency.

  3. Overfitting Reduction : Traditional regression models with highly correlated predictors are prone to overfitting because the model might capture noise in the data instead of the true underlying pattern. By reducing the predictor space, multicollinearity is reduced and consequently the risk of overfitting is also reduced.

Disadvantages

  1. Loss of interpretability : Since the predictors are transformed with the help of PCA, understanding the relationship between original predictor variables and response becomes challenging due to the transformation of predictor variables which makes it harder to explain the practical implications of the findings of the model.

  2. Sensitive to outliers : Outliers may bias the construction of principal components, skewing the representation of predictor space which can further lead to distortions in model’s performance and compromise the reliability of the model.

  3. Computational Complexity : When there are large number of predictors, the computation of principal components becomes a highly intensive task which can pose challenges in terms of processing time and resource requirements particularly in real-time or resource constrained situations.

  4. Linearity Assumption : PCR assumes a linear relationship between predictors and response variables. If non-linear interactions are present then this technique may not capture the relationships properly and may yield suboptimal predictions.

PCR Implementation

We will now fit a PCR model on the facial temperature data to predict the oral temperature. Before doing that lets prepare appropriate training and testing sets for the same. We will be using a 70:30 train-test split.

# Standardise all eleven temperature variables (response included) for PCR
pcr_data <- filtered_data_na_removed |>
  dplyr::select(all_of(temperature_variables)) |>
  scale() |>
  as.data.frame()

# Number of observations
n_obs <- nrow(pcr_data)

# 70% of the rows for training
train_size <- floor(n_obs * 0.7)

# Setting the seed for reproducibility
set.seed(23200555)

# Bug fix: sampling floor(n_obs * 0.3) rows separately could silently drop
# an observation whenever n_obs * 0.7 is fractional; using setdiff() puts
# every remaining row in the test set, so each observation lands in
# exactly one split.
train_indices <- sample(seq_len(n_obs), train_size)
test_indices <- setdiff(seq_len(n_obs), train_indices)

Now, lets fit the PCR model on training data.

# Principal Component Regression of aveOralM on the other ten variables.
# scale = FALSE because the data were already standardised upstream;
# validation = "CV" estimates RMSEP by 10-fold cross-validation for each
# number of components.
pcr_model <- pcr(aveOralM ~ ., data = pcr_data[train_indices,], scale = FALSE, 
                 validation = "CV")
summary(pcr_model)
Data:   X dimension: 696 10 
    Y dimension: 696 1
Fit method: svdpc
Number of components considered: 10

VALIDATION: RMSEP
Cross-validated using 10 random segments.
       (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
CV           1.005   0.5684   0.5255   0.5170   0.5208   0.5034   0.5041
adjCV        1.005   0.5682   0.5254   0.5168   0.5222   0.5031   0.5036
       7 comps  8 comps  9 comps  10 comps
CV      0.5047   0.5047   0.5020    0.5011
adjCV   0.5042   0.5042   0.5014    0.5005

TRAINING: % variance explained
          1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps
X           79.03    88.73    92.49    94.93    97.22    98.38    99.27
aveOralM    68.19    72.86    73.88    73.92    75.57    75.65    75.67
          8 comps  9 comps  10 comps
X           99.78    99.92    100.00
aveOralM    75.74    76.07     76.22

We will now use a validation plot to select the optimum number of principal components.

# Cross-validated RMSEP versus number of components (look for the elbow)
validationplot(pcr_model)

Its clear from the above plot that RMSE is not decreasing much after 2 principal components. Let us also visualize the predictive performance based on the \(R^2\) value.

# Cross-validated R^2 versus number of components
validationplot(pcr_model,val.type = "R2")

We can see here as well that there is not much increase in predictive performance after 2nd principal component. Therefore the optimum number of principal components which can be used is 2. We will now make predictions on test data and see the performance metrics for the model.

# Predict aveOralM for the held-out rows; predict() returns a 3-D array
# with one slice per number of components
predictions_test <- predict(pcr_model, newdata = pcr_data[test_indices,])

# Observed (standardised) oral temperatures in the test set
actual_values_test <- pcr_data$aveOralM[test_indices]

# Slice out the predictions of the 2-component model
predictions_2comps <- predictions_test[, , 2]

# Prediction errors for the 2-component model
residuals_2comp <- actual_values_test - predictions_2comps

# Root mean squared error and mean absolute error on the test set
rmse_2comp <- sqrt(mean(residuals_2comp^2))
mae_2comp <- mean(abs(residuals_2comp))

# Test-set R^2 = 1 - SS_res / SS_tot
SS_tot <- sum((actual_values_test - mean(actual_values_test))^2)
SS_res_2comp <- sum(residuals_2comp^2)
r_squared_2comp <- 1 - (SS_res_2comp / SS_tot)
  
  
# Report the test-set metrics for the 2-component PCR model
cat("Evaluation Metrics for 2 Components:\n")
Evaluation Metrics for 2 Components:
cat("RMSE:", round(rmse_2comp, 4), "\n")
RMSE: 0.5067 
cat("MAE:", round(mae_2comp, 4), "\n")
MAE: 0.3983 
cat("R-squared:", round(r_squared_2comp, 4), "\n")
R-squared: 0.7383 

The \(R^2\) value for model with 2 principal components on the testing set is around 0.74 which is in agreement with the training set as clear from the plots above. Therefore, the two principal components are able to explain around 74% variation in the aveOralM in the test set which can also be validated by looking at the plots and pcr model summary (TRAINING: % variance explained section).